# Required Packages
import pandas as pd
import numpy as np
# Modeling
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
# preprocessing
from sklearn.impute import SimpleImputer
# Visualisation libraries
## Text (colored console output for section headers)
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## progress bar
import progressbar
## seaborn
import seaborn as sns
sns.set_context('paper', rc={'font.size':12,'axes.titlesize':14,'axes.labelsize':12})
sns.set_style('white')
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib 3.8;
# newer matplotlib versions require 'seaborn-v0_8-whitegrid'.
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
%config InlineBackend.figure_format = 'retina'
## missingno
import missingno as msno
import warnings
# Suppress all warnings for cleaner notebook output.
warnings.filterwarnings("ignore")
![]()
In this article, we investigate a simulated dataset that mimics customer behavior on the Starbucks rewards mobile app. Starbucks tends to send out offers to users of the mobile app once every few days. These offers are exclusive — that is, not all users receive the same offer. An offer can contain a discount for their products or sometimes a BOGO (buy one, get one free) deal. These offers have a validity period before the offer expires. The article here is inspired by a towardsdatascience.com article.
def Line(N):
    """Return a horizontal-rule string made of N '=' characters."""
    return '=' * N
def Header(Inp, Length = 120):
    """Print a colored section header: `Inp` highlighted, then a blue '='
    rule padding the line out to `Length` characters."""
    rule = Line(Length - len(Inp) - 1)
    title = Back.BLACK + Fore.CYAN + Style.NORMAL + '%s' % Inp + Style.RESET_ALL
    padding = Fore.BLUE + Style.NORMAL + ' %s' % rule + Style.RESET_ALL
    print(title + padding)
def Bottom(Length = 120):
    """Print a closing blue '=' rule of the given Length."""
    rule = '%s' % Line(Length)
    print(Fore.BLUE + Style.NORMAL + rule + Style.RESET_ALL)
# Portfolio Dataset
Header('Portfolio Dataset:')
# Pre-cleaned offer metadata (one row per offer).
Portfolio = pd.read_csv('StarBucks/Portfolio_Clean.csv')
# NOTE(review): Styler.hide_index() was removed in pandas 2.0 —
# newer pandas requires .style.hide(axis='index').
display(Portfolio.head().style.hide_index())
# Profile Dataset
Header('Profile Dataset:')
# Pre-cleaned customer demographic data.
Profile = pd.read_csv('StarBucks/Profile_Clean.csv')
display(Profile.head().style.hide_index())
# Transcript Dataset
Header('Transcript Dataset:')
# Pre-cleaned event log (offers received/viewed/completed, transactions).
Transcript = pd.read_csv('StarBucks/Transcript_Clean.csv')
display(Transcript.head().style.hide_index())
Bottom()
# Pre-built per-user feature table and combined event data used for modeling.
User_Data = pd.read_csv('StarBucks/User_Data.csv')
Data = pd.read_csv('StarBucks/Data.csv')
The objective of this exercise is to determine the best offer type for a given user. This can be done via a classification method that also provides a probability. Here we use the sklearn MultiOutputClassifier with RandomForestClassifier for our modeling.
# Drop columns not used as model features.
# NOTE(review): these look like derived/leakage columns — confirm against
# the upstream feature-engineering step.
User_Data = User_Data.drop(['No_Offer','BOGO_comp','Info_comp','Disc_comp',
'Tot_Rewards_Rec','Offer_Difficulty'], axis=1)
def Correlation_Plot(Df, Fig_Size):
    """Plot a lower-triangle (plus diagonal) correlation heatmap of Df.

    Parameters
    ----------
    Df : pandas.DataFrame
        Data whose numeric columns are correlated pairwise.
    Fig_Size : int or float
        Side length of the square figure, in inches.
    """
    Correlation_Matrix = Df.corr().round(2)
    # Boolean mask hiding the strictly-upper triangle; k=1 keeps the
    # diagonal visible (the original built a float mask with k=0 and then
    # un-masked the diagonal in a manual loop).
    mask = np.zeros_like(Correlation_Matrix, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True
    Fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap=sns.color_palette("RdYlGn", n_colors=10), linewidths=0.2,
                vmin=0, vmax=1, cbar_kws={"shrink": .5})
    # Removed: an unused `bottom, top = ax.get_ylim()` left over from a
    # truncated matplotlib 3.1.1 tick-clipping workaround — it had no effect.
# Visualize pairwise feature correlations (lower triangle only).
Correlation_Plot(User_Data,14)
# The three offer-type columns we want to predict, mapped to display names.
Target = {'BOGO_offer':'BOGO Offers', 'Disc_offer': 'Discount Offers','Info_offer':'Informational Offers'}
# Features = everything except the three target columns.
X= User_Data.drop(columns = list(Target.keys()))
y = User_Data[list(Target.keys())]
# 70/30 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Show the shape of each split as a small summary table.
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
# Random Forest Classifier using 100 estimators.
# MultiOutputClassifier fits one independent RandomForest per target column,
# turning the multi-label problem into three binary classifications.
clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, oob_score=True))
_ = clf.fit(X_train, y_train)
# Predictions on the held-out test set.
y_pred = clf.predict(X_test)
# Per-target precision/recall/F1 report.
# Reuse y_pred instead of calling clf.predict(X_test) a second time.
display(pd.DataFrame(metrics.classification_report(y_test, y_pred,
                                                   output_dict=True,
                                                   target_names=Target)).round(4))
# Map raw feature-column names to human-readable plot labels.
# Fixed typo: 'otal Transaction Count' -> 'Total Transaction Count'.
Feat_Dict = {'BOGO_Offer_Rec':'BOGO Offer Received', 'Difficulty_per_Offer':'Difficulty per Offer',
             'Disc_Offer_Rec':'Discount Offer Received', 'Gender_Female':'Gender: Female',
             'Gender_Male':'Gender: Male', 'Gender_Other':'Gender: Other', 'Info_Offer_Rec':'Informational Offer Received',
             'Member_Tenure':'Member Tenure', 'Offer_Comp_Rec_Ratio': 'Offer Completed Receive Ratio',
             'Offer_Comp_View_Ratio':'Viewed Offer Completed Ratio', 'Offer_Tran_Cnt_Ratio':'Offer Transaction Count Ratio' ,
             'Offer_Trans_Amnt':'Offer Transaction Amount', 'Offer_Trans_Amnt_Ratio':'Offer Transaction Amount Ratio',
             'Offer_View': 'Viewed Offer', 'Reward_per_Offer': 'Reward per Offer',
             'Tot_Tran_Amnt':'Total Transaction Amount', 'Tot_Tran_Cnt':'Total Transaction Count',
             'Tran_Amnt_per_Offer':'Transactions Amount per Offer','offer_comp': 'Offer Completed'}
# Collect per-target feature importances from each fitted estimator.
Temp = list(Target.values())
Frames = []
for i, target_name in enumerate(Temp):
    # One RandomForest per target; feature_importances_ aligns with X_train columns.
    Temp0 = pd.DataFrame(data=clf.estimators_[i].feature_importances_,
                         index=X_train.columns, columns=['Importance'])
    Temp0['Target'] = target_name
    Frames.append(Temp0)
    del Temp0
# DataFrame.append was removed in pandas 2.0 — build with pd.concat instead.
Results = pd.concat(Frames)
Results = Results.reset_index(drop=False).rename(columns={'index': 'Features'})
# Add an 'Overall' pseudo-target: mean importance across the three targets.
Temp = pd.pivot_table(Results, values='Importance', index=['Features'],
                      aggfunc=np.mean, fill_value=0).reset_index(drop=False)
Temp['Target'] = 'Overall'
Results = pd.concat([Results, Temp])
del Temp
# Replace raw column names with human-readable labels for plotting.
Results['Features'] = Results['Features'].replace(Feat_Dict)
# Grouped bar chart of feature importance per target (plus 'Overall').
Colors = ['LightBlue', 'DeepSkyBlue', 'CornFlowerBlue', 'OrangeRed']
fig = px.bar(Results, x='Features', y='Importance', orientation='v',
             color='Target', color_discrete_sequence=Colors, barmode='group', height=600)
# Removed a redundant earlier update_traces(marker_line_width=0.8) call —
# this call fully overrode it.
fig.update_traces(marker_line_color='Navy', marker_line_width=1.2, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(legend_orientation='h', plot_bgcolor='white',
                  legend_title_text=None,
                  legend=dict(x=0, y=1.1, bordercolor="Black", borderwidth=1))
# Clamp the y-axis so small importances stay readable.
fig['layout']['yaxis'].update(range=[0, .3])
fig.update_layout(title={'text': 'Classification Feature Importance',
                         'x': 0.5, 'y': 1,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Our model can predict multiple offer types for a given customer and then sort the recommendations by the predicted probability of conversion. In this way, we can find the most suitable offer for each customer.
def Best_Offer(ID, Model=clf):
    """Recommend the most suitable offer type(s) for each customer in ID.

    Parameters
    ----------
    ID : array-like of index labels into X identifying the customers to score.
    Model : fitted multi-output classifier (defaults to the global ``clf``).

    Returns
    -------
    pandas.DataFrame
        One row per customer with their recommended offers, sorted by
        predicted probability (or 'No offer is recommended').
    """
    # All available offer names, without the ' Offers' suffix.
    Offers = [x.replace(' Offers', '') for x in Target.values()]
    # BUG FIX: use the Model parameter (was hard-coded to the global clf,
    # silently ignoring any model passed in).
    # Per-class probability estimates for the given user(s).
    Predicted_Prop = Model.predict_proba(X.loc[ID])
    # Hard 0/1 predictions for the given user(s).
    Class_Predicted_Prop = Model.predict(X.loc[ID])
    # Predicted probability of class 1 for each offer type, per user.
    Offer_Predicted_Prop = np.array(
        [[Predicted_Prop[c][i][1] for c in range(len(Offers))]
         for i in range(len(ID))])
    Best_Offer_List = []
    for user in range(len(Offer_Predicted_Prop)):
        if Class_Predicted_Prop[user].sum() > 0:
            # Indices where the predicted label is 1.
            Predicted_Class_ID = np.argwhere(Class_Predicted_Prop[user] == 1).flatten()
            # Sort probabilities in descending order to rank the offers.
            Prop_ID_Sort = np.argsort(-Offer_Predicted_Prop[user])
            Best_Index = [i for i in Prop_ID_Sort if i in Predicted_Class_ID]
            Best_Offer_List.append([Offers[i] for i in Best_Index])
        else:
            Best_Offer_List.append('No offer is recommended')
    return pd.DataFrame(data={'Customers': Data['Person'].loc[ID].values,
                              'Most Suitable Offers': Best_Offer_List})
def Estimated_Probabilities(ID, Model=clf):
    """Return class-0/class-1 probabilities per offer type for each customer.

    Parameters
    ----------
    ID : array-like of index labels into X identifying the customers to score.
    Model : fitted multi-output classifier (defaults to the global ``clf``).

    Returns
    -------
    pandas.DataFrame
        Indexed by customer id, with a two-level column header:
        (offer-type probability, class 0/1).
    """
    targets = list(Target.values())
    # BUG FIX: call predict_proba once (it was recomputed on every loop
    # iteration) and respect the Model parameter (was hard-coded to clf).
    proba = Model.predict_proba(X.loc[ID])
    Prop = pd.concat([pd.DataFrame(proba[i]) for i in range(len(targets))], axis=1)
    # Each offer type contributes two columns (P(class 0), P(class 1)).
    Temp = []
    for x in [x + ' Probability' for x in targets]:
        Temp.append(x)
        Temp.append(x)
    # Generalized: [0, 1] repeated per target (was hard-coded * 3).
    header = [np.array(Temp, dtype=str),
              np.array([0, 1] * len(targets))]
    Prop = pd.DataFrame(Prop.values, columns=header)
    Prop.index = Data['Person'].loc[ID].values
    return Prop
For example, consider a random list of ten customers. We have:
# Sample n customers from the test set and show their recommendations.
n=10
# NOTE(review): np.random.choice samples with replacement by default, so
# duplicate customers are possible — pass replace=False if undesired.
ID = np.random.choice(X_test.index, n)
display(Best_Offer (ID))
display(Estimated_Probabilities(ID))